#################
#Start of script#
#################

#Project Title: "Role Theory, Non-Coercive Influence, and the Agency of Target States: The Case of Kazakhstan’s Ambassadorial Corps and the Russian Diplomatic Academy"
#Author: John C. Stanko


#List of required packages
package_list <- c("berryFunctions", "countrycode", "doBy", "dplyr", "estimatr", "ggeffects", "ggplot2",
  "jtools", "lubridate", "marginaleffects", "margins", "pROC", "readxl", "reshape2", "stargazer", "unvotes")

#Install any needed non-base packages
install.packages(package_list)

#Load required packages (in alphabetical order)
lapply(package_list, library, character.only = T)

#######################################################################

#Set working directory
#Enter your specific file directory path
setwd("/")

##Begin analysis of the ambassadorial corps dataset
#Load the ambassadorial data
dataset <- read_xlsx("FPA_Kazakhstan-RDA-Role-Theory_Full-Dataset_JCStanko.xlsx")
#Subset to only keep ambassadors in the dataset
data <- subset(dataset, ambassador == 1)

##Unreported data veracity test comparing my dataset to the contents of the Gender Diplomacy project dataset
#data2 <- read_xlsx("F:/School/Datasets/Diplomatic Data/GenDip Project/GenDip_ambassadorial-data_Kazakhstan_Russia_John_Stanko.xlsx")
#data2 <- subset(data2, cname_send == "Kazakhstan" & title == 3)
#x1 <- data.frame(unique(data2$name))
#x2 <- data.frame(unique(data$last))
#x1 <- data.frame(sort(x1$unique.data2.name., decreasing = F))
#x2 <- data.frame(sort(x2$unique.data.last., decreasing = F))
#GenDip has several duplicates because of inconsistent name formatting, but looks to be the same set of individuals


##Make sure several key variables read in as numbers
data$jan1_age <- as.numeric(data$jan1_age)
data$reg_num <- as.numeric(data$reg_num)
data$female <- as.numeric(data$female)
data$rda_edu <- as.numeric(data$rda_edu)
data$rda_edu_full <- as.numeric(data$rda_edu_full)
data$kaz_mfa_edu <- as.numeric(data$kaz_mfa_edu)
data$kaz_mfa_full <- as.numeric(data$kaz_mfa_full)
data$other_mfa_edu <- as.numeric(data$other_mfa_edu)
data$other_mfa_edu_full <- as.numeric(data$other_mfa_edu_full)
data$capital <- as.numeric(data$reg_num == 1 | data$reg_num == 2)
data$tot_amb_yrs <- as.numeric(data$tot_amb_yrs)
data$amb_yrs <- as.numeric(data$amb_yrs)

##Remove NAs from relevant categories since that data could not be located in any sources
data <- data[!is.na(data$rda_edu), ]
data <- data[!is.na(data$kaz_mfa_edu), ]
data <- data[!is.na(data$jan1_age), ]
data <- data[!is.na(data$amb_yrs), ]
data <- data[!is.na(data$female), ]
data <- data[!is.na(data$capital), ]
data <- data[!is.na(data$tot_amb_yrs), ]
data <- data[!is.na(data$amb_yrs), ]


##Add a dummy variable for whether a posting was occupied by the same individual as the prior year
data$same_prior_yr <- 0
sub <- unique(data$ccode)
for (i in 1:length(sub)) {
  j <- subset(data, ccode == sub[[i]])
  data$same_prior_yr[which(data$ccode == sub[[i]] & data$year == j$year[1])] <- 0
  if (nrow(j) > 1) {
    for (k in 2:nrow(j)) {
      data$same_prior_yr[which(data$ccode == sub[[i]] & data$year == j$year[k])] <- ifelse(j$last[k] == j$last[k-1] & j$first[k] == j$first[k-1], 1, 0)
    }
  }
}

#Create a data frame for tracking recurrent postings
prior_count <- summaryBy(same_prior_yr ~ year, data = data, fun = c("sum"))
prior_count$actual <- rep(NA, 30)
prior_count$total <- rep(NA, 30)
prior_count$all <- rep(NA, 30)
prior_count$rda <- rep(NA, 30)
prior_count$kda <- rep(NA, 30)

#Input the year-by-year data
for (i in 1:nrow(prior_count)) {
  q <- subset(data, year == prior_count$year[i] & same_prior_yr == 1)
  prior_count$actual[i] <- nrow(q)
  r <- subset(data, year == prior_count$year[i])
  prior_count$total[i] <- nrow(r)
  s <- subset(data, year == prior_count$year[i])
  prior_count$all[i] <- nrow(s)
  t <- subset(s, rda_edu == 1)
  prior_count$rda[i] <- nrow(t)
  u <- subset(s, kaz_mfa_edu == 1)
  prior_count$kda[i] <- nrow(u)
}


#Check total yearly numbers and cohort breakdowns
rus_mid_avg <- c()
final_list <- NULL
cohort_info <- NULL
cohort_info2 <- NULL
unq.amb <- c()
turnover <- data.frame()
unq.experience <- c()
corps_summarizer <- data.frame()
summarizer_hold <- c()
for (k in 1992:2021) {
  #Add year to summary statistics data frame
  summarizer_hold[1] = as.numeric(k)
  #Get the set of ambassadors each year
  y <- subset(data, year == k)
  #Add number of posts to summary statistics data frame
  summarizer_hold[2] = as.numeric(nrow(y))
  #Save the prior year's results for turnover comparison
  unq.amb.last <- unq.amb
  #Clear the list for each year
  unq.amb <- c()
  p <- c()
  for (i in 1:nrow(y)) {
    p = subset(y, last == y$last[i] & first == y$first[i])
    #Make sure the individual is not already accounted for in the case of multiple postings
    if (is.null(unq.amb)) {
      unq.amb <- rbind(unq.amb, p[nrow(p), ])
    }
    if (i > 1) {
      o <- subset(unq.amb, last == y$last[i] & first == y$first[i])
      if (nrow(o) == 0) {
        unq.amb <- rbind(unq.amb, p[nrow(p), ])
      }
    }
  }
  #Add number of unique ambassadors to summary statistics data frame
  summarizer_hold[3] = as.numeric(nrow(unq.amb))
  
  #Create a variable to track turnover within the ambassadorial corps
  if (k != 1992) {
    whip <- c(as.numeric(unq.amb.last$last %in% unq.amb$last))
    turnover <- rbind(turnover, c(k, sum(whip == 0)))
  }
  #Check how many of the unique ambassadors received RDA training
  x <- subset(unq.amb, rda_edu == 1)
  x1 <- nrow(x) / nrow(unq.amb)
  rus_mid_avg <- rbind(rus_mid_avg, c(k, x1))
  final_list <- rbind(final_list, unq.amb)
  
  #Add RDA alumni total to summary statistics data frame
  summarizer_hold[4] = sum(unq.amb$rda_edu)
  #Add RDA alumni total to summary statistics data frame
  summarizer_hold[5] = sum(unq.amb$kaz_mfa_edu)
  #Add same posting totals to summary statistics data frame
  summarizer_hold[6] = sum(unq.amb$same_prior_yr)
  
  ##If interested in the overall numbers, without regard to dual postings:
  #summarizer_hold[4] = sum(y$rda_edu)
  #summarizer_hold[5] = sum(y$kaz_mfa_edu)
  #summarizer_hold[6] = sum(y$same_prior_yr)
  
  
  ##Create comparative data frame for diplomatic training backgrounds by cohort
  x <- mean(as.numeric(unq.amb$jan1_age), na.rm = T)
  y <- subset(unq.amb, rda_edu == 1 | kaz_mfa_edu == 1 | mfa_edu_abroad == 1)
  a1 <- subset(unq.amb, birth_yr <= 1956) #Breaking down oldest cohort
  b1 <- subset(a1, other_mfa_edu == 1)
  c1 <- subset(a1, rda_edu == 1)
  d1 <- subset(a1, kaz_mfa_edu == 1)
  a2 <- subset(unq.amb, birth_yr >= 1957 & birth_yr <= 1970) #Early/mid-career cohort
  b2 <- subset(a2, other_mfa_edu == 1)
  c2 <- subset(a2, rda_edu == 1)
  d2 <- subset(a2, kaz_mfa_edu == 1)
  a3 <- subset(unq.amb, birth_yr >= 1971) #Post-Soviet cohort
  b3 <- subset(a3, other_mfa_edu == 1)
  c3 <- subset(a3, rda_edu == 1)
  d3 <- subset(a3, kaz_mfa_edu == 1)
  z <- c(x, nrow(unq.amb), nrow(a1), nrow(b1), nrow(c1), nrow(d1), nrow(a2), nrow(b2), nrow(c2), nrow(d2), nrow(a3), nrow(b3), nrow(c3), nrow(d3))
  cohort_info <- rbind(cohort_info, z)
  
  #Get disaggregated cohort info
  e1 <- subset(y, birth_yr <= 1956) #Breaking down oldest cohort
  f1 <- subset(e1, other_mfa_edu == 1)
  g1 <- subset(e1, rda_edu == 1)
  h1 <- subset(e1, kaz_mfa_edu == 1)
  e2 <- subset(y, birth_yr >= 1957 & birth_yr <= 1970) #Early/mid-career cohort
  f2 <- subset(e2, other_mfa_edu == 1)
  g2 <- subset(e2, rda_edu == 1)
  h2 <- subset(e2, kaz_mfa_edu == 1)
  e3 <- subset(y, birth_yr >= 1971) #Post-Soviet cohort
  f3 <- subset(e3, other_mfa_edu == 1)
  g3 <- subset(e3, rda_edu == 1)
  h3 <- subset(e3, kaz_mfa_edu == 1)
  z2 <- c(nrow(y), nrow(e1), nrow(f1), nrow(g1), nrow(h1), nrow(e2), nrow(f2), nrow(g2), nrow(h2), nrow(e3), nrow(f3), nrow(g3), nrow(h3))
  cohort_info2 <- rbind(cohort_info2, z2)
  
  #Update the summary statistics data frame for this year
  corps_summarizer <- rbind(corps_summarizer, summarizer_hold)
  
  #For robustness, log the years of experience among unique ambassadors only
  unq.experience <- rbind(unq.experience, mean(unq.amb$tot_amb_yrs))
}
rus_mid_avg <- data.frame(rus_mid_avg)
colnames(rus_mid_avg) <- c("year", "rda_share")
mean(rus_mid_avg[, 2]) #Overall RDA percentage is around 16.6% annually.
#summary(final_list)


#Format cohort data for ease of presentation
cohort_info <- data.frame(cohort_info)
colnames(cohort_info) <- c("Avg Age", "Tot Amb", "35+ Sum", "35+ Other", "35+ RDA", "35+ KZ", "21-34 Sum", "21-34 Other", "21-34 RDA", "21-34 KZ", "U20 Sum", "U20 Other", "U20 RDA", "U20 KZ")
cohort_info$rda_alums <- as.numeric(cohort_info$`35+ RDA` + cohort_info$`21-34 RDA` + cohort_info$`U20 RDA`)
cohort_info$rda_share <- cohort_info$rda_alums / cohort_info$`Tot Amb`
cohort_info$year <- c(1992:2021)
cohort_info$apa_alums <- as.numeric(cohort_info$`35+ KZ` + cohort_info$`21-34 KZ` + cohort_info$`U20 KZ`)

#Streamline data further before making plot
cohort_info_simple <- data.frame(cohort_info$year, cohort_info$`35+ RDA`, cohort_info$`21-34 RDA`, cohort_info$`U20 RDA`)
colnames(cohort_info_simple) <- c("Year", "35+ Cohort", "21-34 Cohort", "Under-20 Cohort")
cohort_info_simple_plot <- melt(cohort_info_simple, id.vars="Year")
colnames(cohort_info_simple_plot) <- c("Year", "Cohort", "Count")

##Plot Figure 2 breakdown of cohorts
ggplot(cohort_info_simple_plot, aes(fill = Cohort, x = Year, y = Count)) + 
  geom_bar(position = "stack", stat = "identity") +
  scale_x_continuous(breaks = round(seq(min(cohort_info_simple_plot$Year), max(cohort_info_simple_plot$Year), by = 2), 2021)) +
  scale_y_continuous(breaks = round(seq(min(cohort_info_simple_plot$Count), max(25), by = 1), 50)) +
  #scale_color_manual(values = c("35+ Cohort"="lightgoldenrod4", "21-34 Cohort"="forestgreen", "Under-20 Cohort"="steelblue")) +
  scale_color_manual(values = c("black")) +
  scale_fill_manual("Cohort", values=c("#182D1B", "#5ba300", "steelblue")) +
  ylab("Total Number") +
  xlab("Year") +
  ggtitle("Figure 2: Breakdown of RDA Alumni by Cohort") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
        plot.title.position = 'plot', plot.title = element_text(hjust = 0.5))


#Overlay plots of RDA and APA alumni presence for Figure 3
plot(cohort_info$year, cohort_info$rda_alums, col = "red2", main = "Figure 3: Comparison of Ambassadors with RDA or APA Training by Year", xlab = "Year", ylab = "Total Alumni", pch = 19)
points(cohort_info$year, cohort_info$apa_alums, col = "aquamarine4", pch = 19)
legend("topleft", legend = c('RDA', 'APA'), pch = c(19, 19), col = c('red2', 'aquamarine4'))


#Data frame capturing turnover patterns
turnover_tracker <- data.frame(prior_count$year, prior_count$all, prior_count$total, prior_count$rda, prior_count$kda, prior_count$actual)
colnames(turnover_tracker) <- c("year", "total", "ambs", "rda", "kda", "repeat")
turnover1 <- insertRows(turnover, 1, new = c(1992, 0))
turnover_tracker <- cbind(turnover_tracker, turnover1$X0L)
#Add RDA alumni total to summary statistics data frame
corps_summarizer <- cbind(corps_summarizer, turnover1$X0L)
colnames(corps_summarizer) <- c("Year", "Posts", "Ambassadors", "RDA", "APA", "Same post", "Turnover")

##Plot the summary statistics of the dataset for Figure 1
corps_summarizer_plot_friendly <- melt(corps_summarizer, id.vars="Year")
ggplot(corps_summarizer_plot_friendly, aes( x=Year, y=value, colour=variable)) + 
  geom_line(linewidth = 1.83) +
  scale_x_continuous(breaks = round(seq(min(corps_summarizer_plot_friendly$Year), max(corps_summarizer_plot_friendly$Year), by = 1), 2021)) +
  scale_y_continuous(breaks = round(seq(min(corps_summarizer_plot_friendly$value), max(corps_summarizer_plot_friendly$value), by = 10), 100)) +
  scale_color_manual(values=c("Posts"="blue", "Ambassadors"="orange", "RDA"="red", "APA"="aquamarine", "Same post" = "gray", "Turnover" = "purple")) +
  xlab("Total Number") +
  ylab("Year") +
  ggtitle("Figure 1: Key Descriptive Statistics of Kazakhstani Ambassadorial Personnel") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
        plot.title.position = 'plot', plot.title = element_text(hjust = 0.5))


##Final control variable is whether an individual is highly experienced, that is, above the mean number of service years
data$experienced <- as.numeric(data$tot_amb_yrs > mean(data$tot_amb_yrs))
mean(data$tot_amb_yrs) #12.19 years
#NB: Using the mean of the unique ambassadors list instead of the full list does not change the substantive results in either Model 1 or 2
data$experienced_alt <- as.numeric(data$tot_amb_yrs > mean(unq.experience))
mean(unq.experience) #9.09 years


###############################################
##Run a logit with predictive modeling to test if the KZ gvt is inducing patterns of diplomatic academy attendance
#Create a prestige variable
data$prestige <- as.numeric(data$maj_pow == 1 | data$neighbor == 1 | data$key_trade_prtnr == 1)
#Unused variable isolating education in Russia from time at the RDA
data$rus_no_rda <- as.numeric(data$rus_edu == 1 & data$rda_edu == 0)

#Classify diplomatic training backgrounds to serve as the main independent variable
data$dipl_trn <- ifelse(data$rda_edu == 0 & data$kaz_mfa_edu == 0 & data$other_mfa_edu == 0, 0, ifelse(data$rda_edu == 1 & data$kaz_mfa_edu == 0 & data$other_mfa_edu == 0, 1, ifelse(data$rda_edu == 0 & data$kaz_mfa_edu == 1 & data$other_mfa_edu == 0, 2, ifelse(data$rda_edu == 0 & data$kaz_mfa_edu == 0 & data$other_mfa_edu == 1, 3, 4))))
data$dipl_trn <- ifelse(data$dipl_trn == 0, "None", ifelse(data$dipl_trn == 1, "RDA", ifelse(data$dipl_trn == 2, "APA", ifelse(data$dipl_trn == 3, "Other", "Multi"))))
data <- subset(data, dipl_trn != "Multi")
data$dipl_trn <- relevel(factor(data$dipl_trn), ref = "None")
data$dipl_trn_strict <- ifelse(data$rda_edu_full == 0 & data$kaz_mfa_full == 0 & data$other_mfa_edu_full == 0, 0, ifelse(data$rda_edu_full == 1 & data$kaz_mfa_full == 0 & data$other_mfa_edu_full == 0, 1, ifelse(data$rda_edu_full == 0 & data$kaz_mfa_full == 1 & data$other_mfa_edu_full == 0, 2, ifelse(data$rda_edu_full == 0 & data$kaz_mfa_full == 0 & data$other_mfa_edu_full == 1, 3, 4))))
data$dipl_trn_strict <- ifelse(data$dipl_trn_strict == 0, "None", ifelse(data$dipl_trn_strict == 1, "RDA", ifelse(data$dipl_trn_strict == 2, "APA", ifelse(data$dipl_trn_strict == 3, "Other", "Multi"))))
data$dipl_trn_strict <- relevel(factor(data$dipl_trn_strict), ref = "None")#data <- subset(data, year >= 1997)
data$no_mfa_training <- as.numeric(data$dipl_trn == "None")

#Generate descriptive statistics regarding prestigious postings for Figure 4
prestige_cut <- subset(data, prestige == 1)
rda_prestige <- data.frame()
apa_prestige <- data.frame()
other_prestige <- data.frame()
none_prestige <- data.frame()
for(i in 1992:2021) {
  cut <- subset(prestige_cut, year == i)
  rda_add <- c(i, nrow(subset(cut, rda_edu == 1)))
  rda_prestige <- rbind(rda_prestige, rda_add)
  apa_add <- c(i, nrow(subset(cut, kaz_mfa_edu == 1)))
  apa_prestige <- rbind(apa_prestige, apa_add)
  other_add <- c(i, nrow(subset(cut, other_mfa_edu == 1)))
  other_prestige <- rbind(other_prestige, other_add)
  none_add <- c(i, nrow(subset(cut, no_mfa_training == 1)))
  none_prestige <- rbind(none_prestige, none_add)
}

#Convert data to long form and plot as Figure 4
prestige_data <- data.frame(rda_prestige$X1992L, rda_prestige$X0L, apa_prestige$X0L, other_prestige$X0L, none_prestige$X3L)
colnames(prestige_data) <- c("Year", "RDA Alum", "APA Alum", "Other DA", "None")
prestige_data_plot <- melt(prestige_data, id.vars="Year")
ggplot(prestige_data_plot, aes( x=Year, y=value, colour=variable)) + 
  geom_line(linewidth = 1.25) +
  geom_point(shape = 19, size = 3) +
  scale_x_continuous(breaks = round(seq(min(prestige_data_plot$Year), max(prestige_data_plot$Year), by = 1), 2021)) +
  scale_y_continuous(breaks = round(seq(min(prestige_data_plot$value), max(prestige_data_plot$value), by = 1), 20)) +
  scale_color_manual(values=c("RDA Alum"="red3", "APA Alum"="aquamarine4", "Other DA"="goldenrod", "None"="burlywood4")) +
  #scale_fill_manual(values=c("RDA Alum"="red1", "APA Alum"="aquamarine", "Other DA"="goldenrod1", "None"="burlywood1")) +
  ylab("Total Number") +
  xlab("Year") +
  ggtitle("Figure 4: Prestigious Postings by Background") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
        plot.title.position = 'plot', plot.title = element_text(hjust = 0.5))

#Do not use scientific notation in coefficients
options(scipen = 999)

#For the first, primary model, the more restrictive criterion of completing an RDA or APA degree program is used
logit.01 <- glm(prestige ~ dipl_trn_strict + experienced + same_prior_yr + jan1_age + for_min + capital, family = binomial(link = "logit"), data = data)
summary(logit.01)
nobs(logit.01) #2027
exp(logit.01$coefficients)
#RDA attendance increases the likelihood of a prestigious posting by over 67%


#As requested by a reviewer, calculating the marginal effects of each variable in the model
summary(margins(logit.01))

#Plot the marginal effects (unused figure)
effect_plot(logit.01, pred = dipl_trn_strict, y.label = "Change in Predicted Probability of Prestigious Posting", x.label = "Diplomatic Training Background", colors = "black") +
  ggtitle("Average Marginal Effect of Diplomatic Training on Prestigious Postings") + theme_minimal()


#Generate a receiver operator curve for Model 1 (not used in final draft)
logit.pred.full <- predict(logit.01, type = "response")
ci.pred.full <- ci.thresholds(roc(as.numeric(data$prestige), logit.pred.full), conf.level = 0.95)
roc.any <- plot.roc(as.numeric(data$prestige), logit.pred.full,
                    main = "Model 1 Predictive Power (Prestigious Postings)", ylab = "True Positive Rate", xlab = "False Positive Rate",
                    print.auc = TRUE, print.auc.x = 0.5, print.auc.y = 0.1)
plot(ci.pred.full, length = .01 * ifelse(attr(ci.pred.full, "roc")$percent, 100, 1), col = par("fg"))

##Predicts ~61% of postings. Likely some informative variables such as social networks would improve performance, but this model still has explanatory power.


#Generate a simple table of the results for Model 1
stargazer(logit.01, type="text",
          dep.var.labels=c("Prestigious Posting"),
          #Unlike in the customized table in the manuscript, the diplomatic academies have to be alphabetized here since that is how they are ordered as factors in R
          covariate.labels=c("APA Degree", "Other Dip. Academy", "RDA Degree",
                             "Experienced","Same as Prior Year Posting", "Age on January 1", "Served as Foreign Minister", "Born in a Capital City"), out="model1.txt")


#Unreported robustness model using the alternative measure of relative experience
logit.01.01 <- glm(prestige ~ dipl_trn_strict + experienced_alt + same_prior_yr + jan1_age + for_min + capital, family = binomial(link = "logit"), data = data)
summary(logit.01.01)
#Similar coefficient and still strongly significant relationship with RDA training
nobs(logit.01.01) #2027
exp(logit.01.01$coefficients)
#RDA attendance still increases the likelihood of a prestigious posting by over 64%


#In response to a reviewer's concern about the "Other Diplomatic Academy" category creating uninformative visualizations, Model 1 was rerun with only the APA and RDA classifications
data_short <- subset(data, dipl_trn_strict != "Other")
logit.01.05 <- glm(prestige ~ dipl_trn_strict + experienced + same_prior_yr + jan1_age + for_min + capital, family = binomial(link = "logit"), data = data_short)
summary(logit.01.05)
#NB: There were no differences in the substantive findings.


#Creating Figure 5 to plot the marginal effects of the interaction between RDA attendance and level of experience on prestige of postings
plot_predictions(logit.01.05, condition = c("dipl_trn_strict", "experienced"),  x.label = "Diplomatic Training Background", y.label = "Prestigious Posting Probability") +
  ggtitle("Figure 5: Average Marginal Effect of Diplomatic Training and Experience") + theme_minimal()
#Indications are younger RDA graduates tend to get more prestigious postings than older ones.


######
#For this second model, it is not important whether the diplomat attended a training course or a complete degree program, so using the default indicators
logit.02 <- glm(prestige ~ dipl_trn + experienced + same_prior_yr + jan1_age + for_min + capital, family = binomial(link = "logit"), data = data)
summary(logit.02)
nobs(logit.02) #2027
exp(logit.02$coefficients)
#RDA training increases the likelihood of a prestigious posting by over 54%

#As requested by a reviewer, calculating the marginal effects of each variable in the model
summary(margins(logit.02))

#Plot the marginal effects (unused figure)
effect_plot(logit.02, pred = dipl_trn, y.label = "Change in Predicted Probability of Prestigious Posting", x.label = "Diplomatic Training Background", colors = "black") +
  ggtitle("Average Marginal Effect of Diplomatic Training and Experience") + theme_minimal()

#Generate a receiver operator curve to test the model's effectiveness
logit.pred.any <- predict(logit.02, type = "response")
ci.pred.any <- ci.thresholds(roc(as.numeric(data$prestige), logit.pred.any), conf.level = 0.95)
roc.any <- plot.roc(as.numeric(data$prestige), logit.pred.any,
                    main = "Model 2 Predictive Power (Prestigious Postings)", ylab = "True Positive Rate", xlab = "False Positive Rate",
                    print.auc = TRUE, print.auc.x = 0.5, print.auc.y = 0.1)
plot(ci.pred.any, length = .01 * ifelse(attr(ci.pred.any, "roc")$percent, 100, 1), col = par("fg"))
##Similar results to Model 1


#Generate a simple table of the results for Model 2
stargazer(logit.02, type="text",
          dep.var.labels=c("Prestigious Posting"),
          covariate.labels=c("APA Attendance", "Other Dip. Academy", "RDA Attendance",
                             "Experienced","Same as Prior Year Posting", "Age on January 1", "Served as Foreign Minister", "Born in a Capital City"), out="model2.txt")


#Generate a rough approximation of Table 1
hold_margins1 <- summary(margins(logit.01))[2]
hold_margins2 <- summary(margins(logit.02))[2]
#Dataset variables are not in the model alphabetically, so need to rearrange the margins output to be in the correct order
reorder <- c(8, 1, 2, 3, 4, 7, 6, 5)
models_summ_hold <- data.frame(hold_margins1, hold_margins2, reorder)
#Add a dummy row for the model intercept
models_summ_hold <- insertRows(models_summ_hold, 1, new = c(NA, NA, 0))
models_summ_hold2 <- models_summ_hold %>% arrange(reorder)
#Bring everything together
model_vars <- c("Intercept", "APA Attendance", "Other Dip. Academy", "RDA Attendance", "Experienced","Same as Prior Year Posting", "Age on January 1", "Served as Foreign Minister", "Born in a Capital City")
colnames(models_summ_hold2) <- c("M1 Margins", "M2 Margins")
table1_rough <- data.frame(model_vars, summary(logit.01)$coefficients[ , 1], summary(logit.01)$coefficients[ , 4], models_summ_hold2$`M1 Margins`, summary(logit.02)$coefficients[ , 1], summary(logit.02)$coefficients[ , 4], models_summ_hold2$`M2 Margins`)
colnames(table1_rough) <- c("Variable", "Model 1 coef", "Model 1 p-value", "Model 1 margins", "Model 2 coef", "Model 2 p-value", "Model 2 margins")
rownames(table1_rough) <- 1:nrow(table1_rough)

#View the table
table1_rough


#######################################################################

##Switching to a check of UNGA voting similarity
#Import UN voting data
votes <- un_votes %>%
  inner_join(un_roll_calls, by = "rcid")

#Reformat dates as year only
votes$date = as.Date(votes$date)
votes$year = lubridate::year(votes$date)

#Change the country code to use COW for simplicity and merging purposes if desired
votes$country_code <- countrycode(votes$country, origin = 'country.name', destination = 'cown')

#Switch vote format to numeric instead of string for ease of analysis; 1 for yes, 0 for no, and 9 for abstention
votes$vote <- ifelse(votes$vote == "yes", 1, ifelse(votes$vote == "no", 0, 9))

#Take only votes since KAZ independence and then subset for Russia and Kazakhstan
votes1 <- subset(votes, year >= 1992)
votes2 <- subset(votes1, country_code == 705 | country_code == 365)

#Summarize by vote outcome across roll call IDs, then isolate out any votes that only one of the countries participated in
dummy1 <- summaryBy(vote ~ rcid + country_code + year, data = votes2, fun = c("sum"))
dummy2 <- dummy1 %>% dplyr::count(rcid)
dummy3 <- subset(dummy2, n == 2)
usable1 <- subset(dummy1, rcid %in% dummy3$rcid)

#Split the two apart by country, then rebind them as columns
kaz <- subset(usable1, country_code == 705)
rus <- subset(usable1, country_code == 365)
rebind <- data.frame(kaz$rcid, kaz$year, kaz$vote.sum, rus$vote.sum)
colnames(rebind) <- c("rcid", "year", "kaz_vote", "rus_vote")

#Calculate vote agreement by year
rebind$agmt <- as.numeric(rebind$kaz_vote == rebind$rus_vote)
table(rebind$agmt)
sum_agmt <- summaryBy(agmt ~ year, data = rebind)
mean(rebind$agmt) * 100 #73.12 average overlap

#Run the model
un.lm.data <- data.frame(sum_agmt, rus_mid_avg[1:28, 2])
colnames(un.lm.data) <- c("Year", "Agreement", "RDA_Share")
model.un <- lm(un.lm.data$Agreement ~ un.lm.data$RDA_Share)
summary(model.un)

#Plot the results as Figure 6
plot(sum_agmt, xlab = "Year", ylab = "Percent Agreement", main = "Figure 6: Kazakhstan UNGA Voting Similarity with Russia", pch = 19, col = "blue4", type = "b")

#Unused plot of linear relationship
ggplot(un.lm.data, aes(x=RDA_Share, y=Agreement)) +
  geom_point(pch = 19, col = "darkslategray4") +
  geom_smooth(method=lm, se=FALSE, col='red3', linewidth=2) +
  xlab("Share of RDA Alumni in KZ Ambassadorial Corps") +
  ylab("Agreement on Votes") +
  ggtitle("UNGA Voting Overlap between Kazakhstan and Russia, 1992-2019", ) +
  theme(plot.title = element_text(hjust = 0.5))

#####################

##Limiting the UNGA voting to only those marked as "important" in the dataset
#Take only votes since KAZ independence and then subset for Russia and Kazakhstan
imp_votes1 <- subset(votes, importantvote == 1 & year >= 1992)
imp_votes2 <- subset(imp_votes1, country_code == 705 | country_code == 365)

#Summarize by vote outcome across roll call IDs, then isolate out any votes that only one of the countries participated in
dummy4 <- summaryBy(vote ~ rcid + country_code + year, data = imp_votes2, fun = c("sum"))
dummy5 <- dummy4 %>% dplyr::count(rcid)
dummy6 <- subset(dummy5, n == 2)
usable2 <- subset(dummy4, rcid %in% dummy6$rcid)

#Split the two apart by country, then rebind them as columns
kaz2 <- subset(usable2, country_code == 705)
rus2 <- subset(usable2, country_code == 365)
rebind2 <- data.frame(kaz2$rcid, kaz2$year, kaz2$vote.sum, rus2$vote.sum)
colnames(rebind2) <- c("rcid", "year", "kaz_vote", "rus_vote")

#Calculate vote agreement across by year
rebind2$agmt <- as.numeric(rebind2$kaz_vote == rebind2$rus_vote)
table(rebind2$agmt)
sum_agmt2 <- summaryBy(agmt ~ year, data = rebind2)
mean(rebind2$agmt) * 100 #60.75 average overlap

#Run the model for "important" votes
un.lm.data2 <- data.frame(sum_agmt2, rus_mid_avg[1:26, 2])
colnames(un.lm.data2) <- c("Year", "Agreement", "RDA_Share")
model.un2 <- lm(un.lm.data2$Agreement ~ un.lm.data2$RDA_Share)
summary(model.un2)

#Unused plot showing percentage overlap by year
plot(sum_agmt2, xlab = "Year", ylab = "Percent Agreement", main = "Kazakhstan UNGA Voting Similarity with Russia", pch = 19, col = "blue4", type = "b")


#Plot the linear regression results as Figure 7
ggplot(un.lm.data2, aes(x=RDA_Share, y=Agreement)) +
  geom_point(pch = 19, col = "darkslategray4") +
  geom_smooth(method=lm, se=FALSE, col='red3', size=2) +
  xlab("Share of RDA Alumni in KZ Ambassadorial Corps") +
  ylab("Agreement on 'Important' Votes") +
  ggtitle("Figure 7: UNGA Voting Overlap between Kazakhstan and Russia, 1992-2017", ) +
  theme(plot.title = element_text(hjust = 0.5))


###############
#End of script#
###############
